{
 "nbformat": 4,
 "nbformat_minor": 0,
 "metadata": {
  "colab": {
   "provenance": []
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3"
  },
  "language_info": {
   "name": "python"
  }
 },
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "id": "7Pisdo9ZJKsJ"
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "# %pip (rather than !pip) installs into the environment of the running kernel;\n",
    "# a single command lets pip resolve all dependencies together.\n",
    "%pip install langchain openai tiktoken lancedb ctransformers sentence_transformers gradio"
   ]
  },
  {
   "cell_type": "code",
   "source": [
    "import os\n",
    "import re\n",
    "import openai\n",
    "import gradio as gr\n",
    "from typing import List, Union\n",
    "import lancedb\n",
    "import langchain\n",
    "from langchain.vectorstores import LanceDB\n",
    "from langchain.chains import RetrievalQA\n",
    "from langchain.llms import CTransformers\n",
    "from langchain.document_loaders import UnstructuredHTMLLoader\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "from langchain.document_loaders import PyPDFLoader\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain.document_loaders import TextLoader\n",
    "from langchain.embeddings import OpenAIEmbeddings\n",
    "from langchain.chains import ConversationalRetrievalChain\n",
    "from langchain.memory import ConversationBufferMemory\n",
    "from langchain.chat_models import ChatOpenAI\n",
    "from langchain.embeddings import HuggingFaceBgeEmbeddings\n",
    "from langchain.document_loaders import WebBaseLoader\n",
    "from langchain.embeddings.openai import OpenAIEmbeddings\n",
    "from langchain.document_loaders import DirectoryLoader\n",
    "from langchain.document_loaders.csv_loader import CSVLoader"
   ],
   "metadata": {
    "id": "KVHLKQh6Jf7c"
   },
   "execution_count": 2,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "CZlzJ_FYJrzo"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [
    "from ctransformers import AutoModelForCausalLM\n",
    "\n",
    "\n",
    "class ChatbotHelper:\n",
    "    \"\"\"Gradio chatbot that answers questions about web pages given by URL.\n",
    "\n",
    "    The first message containing a URL loads the page(s), splits them into\n",
    "    chunks, embeds the chunks into a LanceDB table and builds a\n",
    "    ConversationalRetrievalChain on top of it. Later messages are sent to\n",
    "    that chain. Sending \"clear\" resets the helper.\n",
    "    \"\"\"\n",
    "\n",
    "    def __init__(self):\n",
    "        # Retrieval chain; stays None until a URL has been processed.\n",
    "        self.chatbot_instance = None\n",
    "        # (user message, bot reply) tuples in the order they happened.\n",
    "        self.chat_history = []\n",
    "        # Split documents of the loaded site(s); None means \"not initialized\".\n",
    "        self.chunks = None\n",
    "        # Embedding model, created lazily and reused afterwards.\n",
    "        self.embedder = None\n",
    "\n",
    "    def find_urls(self, text: str) -> List[str]:\n",
    "        \"\"\"Return the URLs found in ``text``, ready to be fetched.\n",
    "\n",
    "        Trailing sentence punctuation is stripped from each match and bare\n",
    "        ``www.`` matches get an ``https://`` scheme, because a URL without a\n",
    "        scheme cannot be requested.\n",
    "        \"\"\"\n",
    "        url_pattern = re.compile(r\"https?://\\S+|www\\.\\S+\")\n",
    "        urls = []\n",
    "        for match in url_pattern.findall(text):\n",
    "            url = match.rstrip(\".,;:!?\")\n",
    "            if url.startswith(\"www.\"):\n",
    "                url = \"https://\" + url\n",
    "            # Drop leftovers such as a lone \"www\" that lost everything else.\n",
    "            if url.startswith((\"http://\", \"https://\")):\n",
    "                urls.append(url)\n",
    "        return urls\n",
    "\n",
    "    def initialize_chatbot(self, urls: List[str]):\n",
    "        \"\"\"Build the retrieval chatbot from the content of ``urls``.\n",
    "\n",
    "        Returns the greeting shown to the user once the chain is ready.\n",
    "        \"\"\"\n",
    "        documents = self.load_website_content(urls)\n",
    "        chunks = self.split_text(documents)\n",
    "        embedder = self.bge_embedding(chunks)\n",
    "        vectorstore = self.create_vector_store(chunks, embedder)\n",
    "        retriever = self.create_retriever(vectorstore)\n",
    "        self.chatbot_instance = self.create_chatbot(retriever)\n",
    "        # Record the chunks only after every step succeeded, so that\n",
    "        # embed_user_query can tell the helper is initialized.\n",
    "        self.chunks = chunks\n",
    "        return \"Chatbot initialized! How can I assist you? now ask your Questions\"\n",
    "\n",
    "    def load_website_content(self, urls):\n",
    "        \"\"\"Load ``urls`` with WebBaseLoader and return the resulting documents.\"\"\"\n",
    "        print(\"Loading website(s) into Documents...\")\n",
    "        documents = WebBaseLoader(web_path=urls).load()\n",
    "        print(\"Done loading website(s).\")\n",
    "        return documents\n",
    "\n",
    "    def load_llm(self):\n",
    "        \"\"\"Return the Mistral-7B GGUF model wrapped in LangChain's CTransformers.\"\"\"\n",
    "        return CTransformers(\n",
    "            model=\"TheBloke/Mistral-7B-v0.1-GGUF\",\n",
    "            model_file=\"mistral-7b-v0.1.Q4_K_M.gguf\",\n",
    "            model_type=\"mistral\",\n",
    "        )\n",
    "\n",
    "    def split_text(self, documents):\n",
    "        \"\"\"Split ``documents`` into 120-character chunks with a 20-character overlap.\"\"\"\n",
    "        text_splitter = RecursiveCharacterTextSplitter(\n",
    "            chunk_size=120, chunk_overlap=20, length_function=len\n",
    "        )\n",
    "        chunks = text_splitter.transform_documents(documents)\n",
    "        print(\"Done splitting documents.\")\n",
    "        return chunks\n",
    "\n",
    "    def bge_embedding(self, chunks):\n",
    "        \"\"\"Return the BGE embedding model, creating it on first use.\n",
    "\n",
    "        ``chunks`` is kept for backward compatibility; it is not used.\n",
    "        \"\"\"\n",
    "        # getattr keeps this working for instances built without __init__.\n",
    "        if getattr(self, \"embedder\", None) is None:\n",
    "            print(\"Creating bge embedder...\")\n",
    "            self.embedder = HuggingFaceBgeEmbeddings(\n",
    "                model_name=\"BAAI/bge-base-en\",\n",
    "                model_kwargs={\"device\": \"cpu\"},\n",
    "                encode_kwargs={\"normalize_embeddings\": True},\n",
    "            )\n",
    "        return self.embedder\n",
    "\n",
    "    def create_vector_store(\n",
    "        self, chunks, embedder, db_path=\"/tmp/lancedb\", table_name=\"pdf_search\"\n",
    "    ):\n",
    "        \"\"\"Embed ``chunks`` into a LanceDB table and return the vector store.\n",
    "\n",
    "        ``db_path`` and ``table_name`` default to the values that used to be\n",
    "        hard-coded. The table is created with mode=\"overwrite\".\n",
    "        \"\"\"\n",
    "        print(\"Creating vectorstore...\")\n",
    "        db = lancedb.connect(db_path)\n",
    "        # The table is created from a single placeholder row before the\n",
    "        # documents are added to it.\n",
    "        # NOTE(review): the placeholder row is never removed, so it can\n",
    "        # presumably show up in retrieval results - confirm and delete it if so.\n",
    "        table = db.create_table(\n",
    "            table_name,\n",
    "            data=[\n",
    "                {\n",
    "                    \"vector\": embedder.embed_query(\"Hello World\"),\n",
    "                    \"text\": \"Hello World\",\n",
    "                    \"id\": \"1\",\n",
    "                }\n",
    "            ],\n",
    "            mode=\"overwrite\",\n",
    "        )\n",
    "        vectorstore = LanceDB.from_documents(chunks, embedder, connection=table)\n",
    "        return vectorstore\n",
    "\n",
    "    def create_retriever(self, vectorstore):\n",
    "        \"\"\"Return ``vectorstore`` exposed as a retriever.\"\"\"\n",
    "        print(\"Creating vectorstore retriever...\")\n",
    "        return vectorstore.as_retriever()\n",
    "\n",
    "    def embed_user_query(self, query):\n",
    "        \"\"\"Return the embedding of ``query``.\n",
    "\n",
    "        If no site has been loaded yet, an explanatory message string is\n",
    "        returned instead of an embedding.\n",
    "        \"\"\"\n",
    "        if self.chunks is None:\n",
    "            return \"Chatbot not initialized. Please provide a URL first.\"\n",
    "        core_embeddings_model = self.bge_embedding(self.chunks)\n",
    "        return core_embeddings_model.embed_query(query)\n",
    "\n",
    "    def create_chatbot(self, retriever):\n",
    "        \"\"\"Build a ConversationalRetrievalChain over ``retriever`` with buffer memory.\"\"\"\n",
    "        llm = self.load_llm()\n",
    "        memory = ConversationBufferMemory(\n",
    "            memory_key=\"chat_history\", return_messages=True\n",
    "        )\n",
    "        return ConversationalRetrievalChain.from_llm(\n",
    "            llm=llm, retriever=retriever, memory=memory\n",
    "        )\n",
    "\n",
    "    def chat(self, conversation_chain, input):\n",
    "        \"\"\"Run ``input`` through ``conversation_chain`` and return its result.\"\"\"\n",
    "        return conversation_chain.run(input)\n",
    "\n",
    "    def respond(self, message):\n",
    "        \"\"\"Handle one user message and return the bot reply as a string.\n",
    "\n",
    "        \"clear\" (any case, surrounding whitespace ignored) resets the helper.\n",
    "        While no chatbot exists, a message containing a URL initializes one;\n",
    "        once it exists, every message is forwarded to the chain.\n",
    "        \"\"\"\n",
    "        text = (message or \"\").strip()\n",
    "\n",
    "        if text.lower() == \"clear\":\n",
    "            self.chatbot_instance = None\n",
    "            self.chunks = None\n",
    "            self.chat_history.clear()\n",
    "            # The interface has exactly one text output, so every path must\n",
    "            # return a single string (never a tuple).\n",
    "            return \"Chat history cleared. Provide a URL to start again.\"\n",
    "\n",
    "        if self.chatbot_instance:\n",
    "            bot_message = self.chat(self.chatbot_instance, message)\n",
    "        else:\n",
    "            urls = self.find_urls(text)\n",
    "            if urls:\n",
    "                bot_message = self.initialize_chatbot(urls)\n",
    "            else:\n",
    "                bot_message = \"Please provide a URL to initialize the chatbot first, then ask any questions related to that site.\"\n",
    "\n",
    "        self.chat_history.append((message, bot_message))\n",
    "        return bot_message\n",
    "\n",
    "    def run_interface(self):\n",
    "        \"\"\"Build the Gradio interface around ``respond`` and launch it.\"\"\"\n",
    "        iface = gr.Interface(\n",
    "            fn=self.respond,\n",
    "            title=\"Chatbot with URL or any website \",\n",
    "            inputs=gr.Textbox(\n",
    "                label=\"Your Query\", placeholder=\"Type your query here...\", lines=5\n",
    "            ),\n",
    "            outputs=[\n",
    "                gr.Textbox(\n",
    "                    label=\"Chatbot Response\",\n",
    "                    type=\"text\",\n",
    "                    # ``value`` sets the initial text; it replaces the older\n",
    "                    # ``default`` keyword, which current Gradio no longer accepts.\n",
    "                    value=\"Chatbot response will appear here.\",\n",
    "                    lines=10,\n",
    "                )\n",
    "            ],\n",
    "        )\n",
    "        iface.launch()\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    # Keep the helper in a module-level name so later cells can inspect it,\n",
    "    # then start the Gradio UI.\n",
    "    chatbot_helper = ChatbotHelper()\n",
    "    chatbot_helper.run_interface()"
   ],
   "metadata": {
    "id": "kSO7JdKNJmLO"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "_Ch7S-Yh1EHa"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "4CYx1qm01ELi"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "A_l2r-BK1EO5"
   },
   "execution_count": null,
   "outputs": []
  },
  {
   "cell_type": "code",
   "source": [],
   "metadata": {
    "id": "wduFx2DH1ERt"
   },
   "execution_count": null,
   "outputs": []
  }
 ]
}